Data
Input
Output from bedtools coverage: - chromosome - region (upstream/downstream of gene, intron, or exon) start - region end - number of TEs that overlapped a region by at least one bp - number of bases in the region with coverage from a TE - region length - fraction of bases in the region with coverage from a TE
Created
windowAndMerge output variables: - n :: the number of elements in the window (upNum=downNum unless at the beginning/end of the chromosome) - sum :: the number of TEs in elements in the window - len :: the total number of TE bp in regions within the window - avg :: the mean frequency of TEs in elements in the window
Import & Analysis
Functions and variables
winsize = 1000000
# Column Names
colIDs <- c("scaff","source","seqOnt","start","end","score","strand","phase",
"attr", "count", "TElen", "eleLen", "density")
colsKeep <- c("scaff", "start", "end", "strand", "attr", "count", "TElen",
"eleLen", "density")
# Scaffolds (too many scaffs leads to plotting issues)
#NCfemScaffs <- c("A1", "A2", "A4", "X_NeoX")
H1scaffs <- c("A1", "A2", "A4", "X")
H2scaffs <- c("A1", "A2", "A4", "Y1", "Y2")
# Division points for the sex chromosome regions
#Make a list of dataframes for each region
importFiles <- function(name) {
# Create file list based on expected dir layout
upstreamTEsFile <- paste("data/",name,".TEsUpstream", sep="")
downstreamTEsFile <- paste("data/",name,".TEsDownstream", sep="")
exonTEsFile <- paste("data/",name,".TEsExons", sep="")
intronTEsFile <- paste("data/",name,".TEsIntrons", sep="")
# Actually import the files with those names
upstreamTEs <- read_tsv(upstreamTEsFile, col_names = colIDs) %>%
select(all_of(colsKeep)) #%>%
#rename(upCount = count, upDensity = density)
downstreamTEs <- read_tsv(downstreamTEsFile, col_names = colIDs) %>%
select(all_of(colsKeep)) #%>%
#rename(downCount = count, downDensity = density)
exonTEs <- read_tsv(exonTEsFile, col_names = colIDs) %>%
select(all_of(colsKeep)) #%>%
#rename(exonCount = count, exonDensity = density)
intronTEs <- read_tsv(intronTEsFile, col_names = colIDs) %>%
select(all_of(colsKeep)) #%>%
#rename(intronCount = count, intronDensity = density)
return( list(upstreamTEs, downstreamTEs, exonTEs, intronTEs) )
}
Window the data so itās a reasonable size for plotting
windowAndMerge <- function(dfList, scaffList) {
upTEs <- dfList[[1]] %>%
filter(scaff %in% scaffList) %>%
mutate(winnum = start%/%as.numeric(winsize)) %>%
group_by(scaff, winnum) %>%
summarise(upSize = sum(eleLen), upSum = sum(count), upNum = n(),
upLen = sum(TElen), upAvg = mean(density))
downTEs <- dfList[[2]] %>%
filter(scaff %in% scaffList) %>%
mutate(winnum = start%/%as.numeric(winsize)) %>%
group_by(scaff, winnum) %>%
summarise(downSize = sum(eleLen), downSum = sum(count), downNum = n(),
downLen = sum(TElen), downAvg = mean(density))
exTEs <- dfList[[3]] %>%
filter(scaff %in% scaffList) %>%
mutate(winnum = start%/%as.numeric(winsize)) %>%
group_by(scaff, winnum) %>%
summarise(exonSize = sum(eleLen), exonSum = sum(count), exonNum = n(),
exonLen = sum(TElen), exonAvg = mean(density))
inTEs <- dfList[[4]] %>%
filter(scaff %in% scaffList) %>%
mutate(winnum = start%/%as.numeric(winsize)) %>%
group_by(scaff, winnum) %>%
summarise(intronSize = sum(eleLen), intronSum = sum(count), intronNum = n(),
intronLen = sum(TElen), intronAvg = mean(density))
allTogether <- full_join(upTEs, downTEs) %>%
full_join(exTEs) %>%
full_join(inTEs)
#returnCounts <- allTogether %>%
# pivot_longer(c(upSum, downSum, exonSum, intronSum),
# names_to = "AreaC", values_to = "sum") %>%
# pivot_longer(c(upAvg, downAvg, exonAvg, intronAvg),
# names_to = "AreaA", values_to = "avg") %>%
# pivot_longer(c(upNum, downNum, exonNum, intronNum),
# names_to = "AreaN", values_to = "count")
return(allTogether)
}
Merge the H1 and H2 data by gene orthology
robustMin <- function(x) {if (length(x)>0) min(x) else NA}
genewise <- function(H1list, H2list, pairList) {
{
H1tibble <- H1list %>%
# Flatten the list of tibbles, adding a new row to specify location
list_rbind(, names_to = "location") %>%
# Only use the chromosome-scale scaffolds
filter(scaff %in% H1scaffs) %>%
# There are duplicates because of the ortholog list formatting
distinct() %>%
# Rename the location var so the values are meaningful & extract the
#gene name from the attribute column to match with pair list
mutate(location = case_when(location == 1 ~ "upstream",
location == 2 ~ "downstream",
location == 3 ~ "intron",
location == 4 ~ "exon"),
H1 = str_split_i(
str_match(attr, "^ID=(.+?);")[,2],
pattern = "-", 1) ) %>%
# Keep only the useful columns (don't need end, strand, or attr)
select(location, scaff, start, count:density, H1) %>%
# Summarise the exon/intron data per gene
group_by(location, scaff, H1) %>%
summarize(start = robustMin(start), count = sum(count),
TElen = sum(TElen), eleLen = sum(eleLen),
frequency = mean(density), .groups = "keep") %>%
# Find the H2 gene match for H1 orthologs (and don't keep any non-orthologs)
inner_join(pairList, by = join_by(H1), multiple = "all")
} # Make the H1 tibble (documented well)
{
H2tibble <- H2list %>%
list_rbind(, names_to = "location") %>%
filter(scaff %in% H2scaffs) %>%
distinct() %>%
mutate(location = case_when(location == 1 ~ "upstream",
location == 2 ~ "downstream",
location == 3 ~ "intron",
location == 4 ~ "exon"),
H2 = str_split_i(
str_match(attr, "^ID=(.+?);")[,2],
pattern = "-", 1) ) %>%
select(location, scaff, start, count:density, H2) %>%
group_by(location, scaff, H2) %>%
summarize(start = robustMin(start), count = sum(count),
TElen = sum(TElen), eleLen = sum(eleLen),
frequency = mean(density), .groups = "keep") %>%
inner_join(pairList, by = join_by(H2), multiple = "all")
} # Make the H2 tibble
# Join the two tibbles together
paired <- full_join(H1tibble, H2tibble, by = c("H1", "H2", "location"),
suffix = c("H1", "H2"), multiple = "all")
return(paired)
}
Import
Actually importing the data & summarizing it
# NC male Haplotype 1
H1list <- importFiles("orthoH1")
H1counts <- windowAndMerge(H1list, H1scaffs)
# NC male Haplotype 2
H2list <- importFiles("orthoH2")
H2counts <- windowAndMerge(H2list, H2scaffs)
# Gene pairing information
pairList <- read_tsv("data/pairs.clean", col_names = c("H1","H2")) %>%
#mutate(H1 = gsub("\\*", "", H1),
# H2 = gsub("\\*", "", H2)) %>%
distinct()
Looking at regions within the sex chromosomes
XregionTEs <- H1counts %>%
select(scaff, winnum, upLen, downLen,, exonLen, intronLen,
upSize, downSize, exonSize, intronSize) %>%
filter(scaff == "X") %>%
mutate(region = case_when(
winnum < 71 ~ "PAR1",
winnum < 261 ~ "Old",
winnum < 367 ~ "New",
winnum >= 367 ~ "PAR2"
)) %>%
group_by(region) %>%
summarize(up = sum(upLen, na.rm=T) / sum(upSize, na.rm=T),
down = sum(downLen, na.rm=T) / sum(downSize, na.rm=T),
exon = sum(exonLen, na.rm=T) / sum(exonSize, na.rm=T),
intron = sum(intronLen, na.rm=T) / sum(intronSize, na.rm=T)) %>%
# summarize(up = sum(upSize, na.rm=T),
# down = sum(downSize, na.rm=T),
# exon = sum(exonSize, na.rm=T),
# intron = sum(intronSize, na.rm=T)) %>%
pivot_longer(cols = up:intron, names_to = "location", values_to = "X")
Y1regionTEs <- H2counts %>%
select(scaff, winnum, upLen, downLen,, exonLen, intronLen,
upSize, downSize, exonSize, intronSize) %>%
filter(scaff == "Y1") %>%
mutate(region = case_when(
winnum < 64 ~ "New",
winnum < 272 ~ "Old",
winnum >= 272 ~ "PAR1"
))%>%
group_by(region) %>%
summarize(up = sum(upLen, na.rm=T) / sum(upSize, na.rm=T),
down = sum(downLen, na.rm=T) / sum(downSize, na.rm=T),
exon = sum(exonLen, na.rm=T) / sum(exonSize, na.rm=T),
intron = sum(intronLen, na.rm=T) / sum(intronSize, na.rm=T)) %>%
# summarize(up = sum(upSize, na.rm=T),
# down = sum(downSize, na.rm=T),
# exon = sum(exonSize, na.rm=T),
# intron = sum(intronSize, na.rm=T)) %>%
pivot_longer(cols = up:intron, names_to = "location", values_to = "Y1")
Y2regionTEs <- H2counts %>%
select(scaff, winnum, upLen, downLen,, exonLen, intronLen,
upSize, downSize, exonSize, intronSize) %>%
filter(scaff == "Y2") %>%
mutate(region = case_when(
winnum < 121 ~ "PAR2",
winnum < 159 ~ "New",
winnum >= 159 ~ "Old"
)) %>%
group_by(region) %>%
summarize(up = sum(upLen, na.rm=T) / sum(upSize, na.rm=T),
down = sum(downLen, na.rm=T) / sum(downSize, na.rm=T),
exon = sum(exonLen, na.rm=T) / sum(exonSize, na.rm=T),
intron = sum(intronLen, na.rm=T) / sum(intronSize, na.rm=T)) %>%
# summarize(up = sum(upSize, na.rm=T),
# down = sum(downSize, na.rm=T),
# exon = sum(exonSize, na.rm=T),
# intron = sum(intronSize, na.rm=T)) %>%
pivot_longer(cols = up:intron, names_to = "location", values_to = "Y2")
allRegionTEs <- full_join(XregionTEs, Y1regionTEs,
by = join_by(region, location)) %>%
full_join(Y2regionTEs, by = join_by(region, location)) %>%
pivot_longer(cols = c(X, Y1, Y2),
names_to = "Chromosome", values_to = "Frequency")
Region āsizesā = sum of all element bps in the region
XregionSizes <- H1counts %>%
select(scaff, winnum, upSize, downSize, exonSize, intronSize) %>%
filter(scaff == "X") %>%
mutate(region = case_when(
winnum < 71 ~ "PAR1",
winnum < 261 ~ "Old",
winnum < 367 ~ "New",
winnum >= 367 ~ "PAR2"
)) %>%
group_by(region) %>%
# summarize(up = sum(upLen, na.rm=T) / sum(upSize, na.rm=T),
# down = sum(downLen, na.rm=T) / sum(downSize, na.rm=T),
# exon = sum(exonLen, na.rm=T) / sum(exonSize, na.rm=T),
# intron = sum(intronLen, na.rm=T) / sum(intronSize, na.rm=T)) %>%
summarize(up = sum(upSize, na.rm=T),
down = sum(downSize, na.rm=T),
exon = sum(exonSize, na.rm=T),
intron = sum(intronSize, na.rm=T)) %>%
pivot_longer(cols = up:intron, names_to = "location", values_to = "X")
Y1regionSizes <- H2counts %>%
select(scaff, winnum, upLen, downLen,, exonLen, intronLen,
upSize, downSize, exonSize, intronSize) %>%
filter(scaff == "Y1") %>%
mutate(region = case_when(
winnum < 64 ~ "New",
winnum < 272 ~ "Old",
winnum >= 272 ~ "PAR1"
))%>%
group_by(region) %>%
# summarize(up = sum(upLen, na.rm=T) / sum(upSize, na.rm=T),
# down = sum(downLen, na.rm=T) / sum(downSize, na.rm=T),
# exon = sum(exonLen, na.rm=T) / sum(exonSize, na.rm=T),
# intron = sum(intronLen, na.rm=T) / sum(intronSize, na.rm=T)) %>%
summarize(up = sum(upSize, na.rm=T),
down = sum(downSize, na.rm=T),
exon = sum(exonSize, na.rm=T),
intron = sum(intronSize, na.rm=T)) %>%
pivot_longer(cols = up:intron, names_to = "location", values_to = "Y1")
Y2regionSizes <- H2counts %>%
select(scaff, winnum, upLen, downLen,, exonLen, intronLen,
upSize, downSize, exonSize, intronSize) %>%
filter(scaff == "Y2") %>%
mutate(region = case_when(
winnum < 121 ~ "PAR2",
winnum < 159 ~ "New",
winnum >= 159 ~ "Old"
)) %>%
group_by(region) %>%
# summarize(up = sum(upLen, na.rm=T) / sum(upSize, na.rm=T),
# down = sum(downLen, na.rm=T) / sum(downSize, na.rm=T),
# exon = sum(exonLen, na.rm=T) / sum(exonSize, na.rm=T),
# intron = sum(intronLen, na.rm=T) / sum(intronSize, na.rm=T)) %>%
summarize(up = sum(upSize, na.rm=T),
down = sum(downSize, na.rm=T),
exon = sum(exonSize, na.rm=T),
intron = sum(intronSize, na.rm=T)) %>%
pivot_longer(cols = up:intron, names_to = "location", values_to = "Y2")
allRegionSizes <- full_join(XregionSizes, Y1regionSizes,
by = join_by(region, location)) %>%
full_join(Y2regionSizes, by = join_by(region, location)) %>%
pivot_longer(cols = c(X, Y1, Y2),
names_to = "Chromosome", values_to = "Size")
Per-Gene Comparisons
Want to look at differences per gene instead of in aggregate!
There are usually multiple rows for introns/exons per gene - want to summarise
pairedGenes <- genewise(H1list, H2list, pairList)
# Trying to figure out why NAs appear during the join - the input was supposed to include *only* H1:H2 orthologs...
# paired %>%
# filter(is.na(countH1)) #396 rows
# paired %>%
# filter(is.na(countH2)) #649 rows
#
# H1tibble %>%
# filter(is.na(H2)) #0
# H2tibble %>%
# filter(is.na(H1)) #0
If we just want to look at the genes with orthologs on X or Y1/Y2
mutate(region = case_when(
winnum < 71 ~ "PAR1",
winnum < 261 ~ "Old",
winnum < 367 ~ "New",
winnum >= 367 ~ "PAR2"
))
Visualization
Across Chromosomes
TE Frequency
Bedtools provides a TE frequency measurement (how much of the featureās length overlaps with an annotated TE), which I averaged within each 1Mb window.
H1counts %>%
ggplot(aes(winnum)) +
geom_line(aes(y = upAvg, colour = "Upstream")) +
geom_line(aes(y = downAvg, colour = "Downstream")) +
geom_line(aes(y = exonAvg, colour = "Exon")) +
geom_line(aes(y = intronAvg, colour = "Intron")) +
pubTheme + theme(legend.position=c(0.87,0.5)) +
labs(x = "H1 Position (1Mb windows)",
y = "Average TE Frequency in this Window",
colour = "Location") +
facet_grid( vars(scaff), scale = "free_x", space = "free" )
H2counts %>%
ggplot(aes(winnum)) +
geom_line(aes(y = upAvg, colour = "Upstream")) +
geom_line(aes(y = downAvg, colour = "Downstream")) +
geom_line(aes(y = exonAvg, colour = "Exon")) +
geom_line(aes(y = intronAvg, colour = "Intron")) +
pubTheme + theme(legend.position=c(0.87,0.5)) +
labs(x = "H2 Position (1Mb windows)",
y = "Average TE Frequency in this Window",
colour = "Location") +
facet_grid( vars(scaff), scale = "free_x", space = "free" )
Generally, the 1kb regions up/downstream of genes contain a larger proportion of TEs than introns or exons do. There are some interestingly TE-ridden introns, though, too.
TE Count
Bedtools overlap also provides the number of TEs in a given feature, which is slightly more relevant to my interest in recent insertions. I averaged this sum within each 1Mb window.
H1counts %>%
ggplot(aes(winnum)) +
geom_line(aes(y = upSum, colour = "Upstream")) +
geom_line(aes(y = downSum, colour = "Downstream")) +
geom_line(aes(y = exonSum, colour = "Exon")) +
geom_line(aes(y = intronSum, colour = "Intron")) +
pubTheme + theme(legend.position=c(0.87,0.5)) +
labs(x = "H1 Position (1Mb windows)",
y = "Windowed TE count",
colour = "Location") +
facet_grid( vars(scaff), scale = "free_x", space = "free" )
H2counts %>%
ggplot(aes(winnum)) +
geom_line(aes(y = upSum, colour = "Upstream")) +
geom_line(aes(y = downSum, colour = "Downstream")) +
geom_line(aes(y = exonSum, colour = "Exon")) +
geom_line(aes(y = intronSum, colour = "Intron")) +
pubTheme + theme(legend.position=c(0.87,0.5)) +
labs(x = "H2 Position (1Mb windows)",
y = "Windowed TE count",
colour = "Location") +
facet_grid( vars(scaff), scale = "free_x", space = "free" )
Occasional huge spikes in the number of TEs in H1 introns made things hard to interpret for the rest of the features.
H1counts %>%
ggplot(aes(winnum)) +
geom_line(aes(y = upSum, colour = "Upstream")) +
geom_line(aes(y = downSum, colour = "Downstream")) +
geom_line(aes(y = exonSum, colour = "Exon")) +
# geom_line(aes(y = intronSum, colour = "Intron")) +
pubTheme + theme(legend.position=c(0.87,0.5)) +
labs(x = "H1 Position (1Mb windows)",
y = "Windowed TE count",
colour = "Location") +
facet_grid( vars(scaff), scale = "free_x", space = "free" )
I also divided each windowās TE sum metric by the number of features in it (e.g.Ā the average number of intron TEs in the window is divided by the number of introns).
H1counts %>%
ggplot(aes(winnum)) +
geom_line(aes(y = upSum/upNum, colour = "Upstream")) +
geom_line(aes(y = downSum/downNum, colour = "Downstream")) +
geom_line(aes(y = exonSum/exonNum, colour = "Exon")) +
# geom_line(aes(y = intronSum/intronNum, colour = "Intron")) +
pubTheme + theme(legend.position=c(0.87,0.5)) +
labs(x = "H1 Position (1Mb windows)",
y = "Windowed count divided by feature count",
colour = "Location") +
facet_grid( vars(scaff), scale = "free_x", space = "free" )
H2counts %>%
ggplot(aes(winnum)) +
geom_line(aes(y = upSum/upNum, colour = "Upstream")) +
geom_line(aes(y = downSum/downNum, colour = "Downstream")) +
geom_line(aes(y = exonSum/exonNum, colour = "Exon")) +
# geom_line(aes(y = intronSum/intronNum, colour = "Intron")) +
pubTheme + theme(legend.position=c(0.87,0.5)) +
labs(x = "H2 Position (1Mb windows)",
y = "Windowed count divided by feature count",
colour = "Location") +
facet_grid( vars(scaff), scale = "free_x", space = "free" )
With the unfiltered data set, there were an average of 2 TEs within 1kb of genes. With the filtered set, it looks about the same but with variation closer to what Iād expect (low levels in the regions of high gene density).
Introns are Weird
I thought this section was going to be unhelpful with the filtered data set, but this is definitely helping to look into whatās going on with the introns on the X chromosome.
H1counts %>%
select(c(scaff, winnum, intronNum, intronSum)) %>%
ggplot(aes(x = winnum)) +
geom_line(aes(y = intronNum, colour = "Introns")) +
geom_line(aes(y = intronSum, colour = "TEs in Introns")) +
pubTheme + theme(legend.position=c(0.87,0.5)) +
labs(x = "H1 Position (1Mb windows)", y = "Average in Window",
colour = "Feature Type") +
facet_grid( vars(scaff), scale = "free_x", space = "free" )
H2counts %>%
select(c(scaff, winnum, intronNum, intronSum)) %>%
ggplot(aes(x = winnum)) +
geom_line(aes(y = intronNum, colour = "Introns")) +
geom_line(aes(y = intronSum, colour = "TEs in Introns")) +
pubTheme + theme(legend.position=c(0.87,0.5)) +
labs(x = "H2 Position (1Mb windows)", y = "Average in Window",
colour = "Feature Type") +
facet_grid( vars(scaff), scale = "free_x", space = "free" )
The H1 intron data fits really nicely with my expectations but I find H2 intron data very confusing. I donāt know why there are so many fewer introns in the H2 gene annotationā¦
Bar plots
Region Sizes
ggplot(allRegionSizes, aes(x = region, y = Size, fill = Chromosome)) +
geom_col(position = position_dodge()) +
pubTheme + scale_fill_manual(values = pubColours) +
labs(x = "Region", y = "Number of bp in this region",
fill = "Chromosome") +
facet_grid(vars(location), space = "free")
X chromosome introns seem to be much more frequent/larger for these 1:1 orthologs.
Divisions within sex chromosomes
ggplot(allRegionTEs, aes(x = region, y = Frequency, fill = Chromosome)) +
geom_col(position = position_dodge()) +
pubTheme + scale_fill_manual(values = pubColours) +
labs(x = "Region", y = "Frequency of TEs",
fill = "Chromosome") +
facet_grid(vars(location), space = "free")
The number of introns in genes on the X is very high, which might help explain why there are so many TEs in them?? Overall, this is just super weirdā¦
Genewise Comparisons
For this section, every point on the graph is a gene with at least one ortholog in Haplotype 1 and Haplotype 2.
Quick stats - Count
#glimpse(pairedGenes)
# Count data
pairedCount <- pairedGenes %>%
group_by(scaffH1, scaffH2, location) %>%
select(location:scaffH2, countH1, countH2) %>%
summarise(meanH1 = mean(countH1), meanH2 = mean(countH2), n = n(),
medianH1 = median(countH1), medianH2 = median(countH2),
.groups = "keep") %>%
drop_na() # still very concerned about this
#glimpse(pairedCount)
pairedGenes %>%
group_by(scaffH1, scaffH2, location) %>%
select(location:scaffH2, countH1, countH2) %>%
drop_na() %>%
pivot_longer(cols = c(countH1, countH2), names_to = "Haplotype",
values_to = "TEcount") %>%
mutate(Haplotype = case_when(Haplotype == "countH1" ~ "H1",
Haplotype == "countH2" ~ "H2")) %>%
ggplot() +
geom_violin(aes(x = scaffH2, y = TEcount, fill = Haplotype)) +
#geom_point(aes(x = scaffH2, y = TEcount, colour = Haplotype), alpha = 0.1,
# position = position_dodge2(width = 0.5)) +
pubTheme + theme(legend.position = c(0.98, 0.89)) +
scale_colour_manual(values = pubColours) +
scale_fill_manual(values = pubColours) +
labs(x = "Chromosome", y = "TE count") +
facet_wrap(vars(location), scales = "free")
# Plot Mean
ggplot(pairedCount) +
geom_point(aes(x = scaffH1, y = meanH1, size = n), colour = pubColours[1],
position = position_dodge2(width = 0.3)) +
geom_point(aes(x = scaffH2, y = meanH2, size = n), colour = pubColours[2],
position = position_dodge2(width = 0.3)) +
# Adding text to know where the ortholog is
geom_text(aes(x = scaffH1, y = meanH1, label = scaffH2), colour = pubColours[5],
position = position_dodge2(width = 0.3), size = 3) +
geom_text(aes(x = scaffH2, y = meanH2, label = scaffH1), colour = pubColours[5],
position = position_dodge2(width = 0.3), size = 3) +
pubTheme + theme(legend.position = "none") +
labs(title = "Mean TE count",x = "Scaffold", y = "Mean TE count") +
facet_wrap(vars(location), scales = "free")
# Plot median
ggplot(pairedCount) +
geom_point(aes(x = scaffH1, y = medianH1, size = n), colour = pubColours[1],
position = position_dodge2(width = 0.3)) +
geom_point(aes(x = scaffH2, y = medianH2, size = n), colour = pubColours[2],
position = position_dodge2(width = 0.3)) +
# Adding text to know where the ortholog is
geom_text(aes(x = scaffH1, y = medianH1, label = scaffH2), colour = pubColours[5],
position = position_dodge2(width = 0.3), size = 3) +
geom_text(aes(x = scaffH2, y = medianH2, label = scaffH1), colour = pubColours[5],
position = position_dodge2(width = 0.3), size = 3) +
pubTheme + theme(legend.position = "none") +
labs(title = "Median TE count",x = "Scaffold", y = "Median TE count") +
facet_wrap(vars(location), scales = "free")
Quick Stats - Frequency
Count feels like a more reasonable metric to do this with, but weāre using frequency for everything else.
pairedFreq <- pairedGenes %>%
group_by(scaffH1, scaffH2, location) %>%
select(location:scaffH2, frequencyH1, frequencyH2) %>%
summarise(meanH1 = mean(frequencyH1), meanH2 = mean(frequencyH2), n = n(),
medianH1 = median(frequencyH1), medianH2 = median(frequencyH2),
.groups = "keep") %>%
drop_na() # still very concerned about this
glimpse(pairedFreq)
## Rows: 80
## Columns: 8
## Groups: scaffH1, scaffH2, location [80]
## $ scaffH1 <chr> "A1", "A1", "A1", "A1", "A1", "A1", "A1", "A1", "A1", "A1", "ā¦
## $ scaffH2 <chr> "A1", "A1", "A1", "A1", "A2", "A2", "A2", "A2", "A4", "A4", "ā¦
## $ location <chr> "downstream", "exon", "intron", "upstream", "downstream", "exā¦
## $ meanH1 <dbl> 0.39596418, 0.09623622, 0.03108310, 0.39323348, 0.59815837, 0ā¦
## $ meanH2 <dbl> 0.40988306, 0.08445149, 0.03302886, 0.41140601, 0.61930317, 0ā¦
## $ n <int> 4857, 4712, 4857, 4857, 221, 189, 221, 221, 123, 107, 123, 12ā¦
## $ medianH1 <dbl> 0.373000000, 0.000000000, 0.000000000, 0.367000000, 0.7180000ā¦
## $ medianH2 <dbl> 0.38300000, 0.00000000, 0.00000000, 0.38900000, 0.74700000, 0ā¦
pairedGenes %>%
group_by(scaffH1, scaffH2, location) %>%
select(location:scaffH2, frequencyH1, frequencyH2) %>%
drop_na() %>%
pivot_longer(cols = c(frequencyH1, frequencyH2), names_to = "Haplotype",
values_to = "frequency") %>%
mutate(Haplotype = case_when(Haplotype == "densityH1" ~ "H1",
Haplotype == "densityH2" ~ "H2")) %>%
ggplot() +
geom_boxplot(aes(x = scaffH2, y = frequency, fill = Haplotype)) +
geom_point(aes(x = scaffH2, y = frequency, colour = Haplotype),
alpha = 0.02, position = "jitter") +
pubTheme + theme(legend.position = c(0.9, 0.85)) +
scale_colour_manual(values = pubColours) +
scale_fill_manual(values = pubColours) +
labs(x = "H2 Chromosome", y = "TE frequency") +
facet_wrap(vars(location), scales = "free")
# Plot Mean
ggplot(pairedFreq) +
geom_point(aes(x = scaffH1, y = meanH1, size = n), colour = pubColours[1],
position = position_dodge2(width = 0.3)) +
geom_point(aes(x = scaffH2, y = meanH2, size = n), colour = pubColours[2],
position = position_dodge2(width = 0.3)) +
# Adding text to know where the ortholog is
geom_text(aes(x = scaffH1, y = meanH1, label = scaffH2), colour = pubColours[5],
position = position_dodge2(width = 0.3), size = 3) +
geom_text(aes(x = scaffH2, y = meanH2, label = scaffH1), colour = pubColours[5],
position = position_dodge2(width = 0.3), size = 3) +
pubTheme + theme(legend.position = "none") +
labs(x = "Scaffold", y = "Mean TE frequency") +
facet_wrap(vars(location), scales = "free")
# Plot median
ggplot(pairedFreq) +
geom_point(aes(x = scaffH1, y = medianH1, size = n), colour = pubColours[1],
position = position_dodge2(width = 0.3)) +
geom_point(aes(x = scaffH2, y = medianH2, size = n), colour = pubColours[2],
position = position_dodge2(width = 0.3)) +
# Adding text to know where the ortholog is
geom_text(aes(x = scaffH1, y = medianH1, label = scaffH2), colour = pubColours[5],
position = position_dodge2(width = 0.3), size = 3) +
geom_text(aes(x = scaffH2, y = medianH2, label = scaffH1), colour = pubColours[5],
position = position_dodge2(width = 0.3), size = 3) +
pubTheme + theme(legend.position = "none") +
labs(x = "Scaffold", y = "Median TE frequency") +
facet_wrap(vars(location), scales = "free")
Orthologs on sex chromosomes
Look for a difference in TE frequency near genes with orthologs on the X and Y1/Y2.
pairedGenes %>%
filter(scaffH2 == "Y1" | scaffH2 == "Y2") %>%
filter(scaffH1 == "X") %>%
ggplot() +
geom_point(aes(x = frequencyH1, y = frequencyH2, colour = scaffH2),
alpha=0.3) +
geom_smooth(aes(x = frequencyH1, y = frequencyH2, colour = scaffH2)) +
geom_abline() +
pubTheme + scale_colour_manual(values = pubColours) +
#theme(legend.position = "none") +
labs(title = "Genes with a Y1/Y2 ortholog in Hap2 and an X ortholog in Hap1",
x = "TE frequency for H1 ortholog", y = "TE frequency for H2 ortholog",
colour = "H2 Chrom") +
facet_wrap(vars(location), nrow = 2)
pairedGenes %>%
filter(scaffH2 == "Y1" | scaffH2 == "Y2") %>%
filter(scaffH1 != "X") %>%
ggplot() +
geom_point(aes(x = frequencyH1, y = frequencyH2, colour = scaffH1),
alpha=0.3) +
geom_smooth(aes(x = frequencyH1, y = frequencyH2, colour = scaffH1)) +
geom_abline() +
pubTheme + scale_colour_manual(values = pubColours) +
#theme(legend.position = "none") +
labs(title = "Genes with a Y1/Y2 ortholog in Hap2 and autosomal ortholog in Hap1",
x = "TE frequency for H1 ortholog", y = "TE frequency for H2 ortholog",
colour = "H1 Chrom") +
ylim(0,1) +
facet_wrap(vars(location))
The results feel very confusing to me
pairedGenes %>%
filter(scaffH2 == "Y1" | scaffH2 == "Y2") %>%
filter(scaffH1 == "X") %>%
mutate(frequencyRatio = frequencyH2/frequencyH1) %>%
ungroup() %>%
select(scaffH1, scaffH2, frequencyRatio, location) %>%
#pivot_longer(cols = c(scaffH1, scaffH2), names_to = "scaff") #%>%
ggplot() +
#geom_point(aes(x = location, y = frequencyRatio)) +
geom_col(aes(x = location, y = frequencyRatio, fill = scaffH2),
position = position_dodge()) +
pubTheme + scale_colour_manual(values = pubColours) +
#theme(legend.position = "none") +
labs(title = "Genes with a Y1/Y2 ortholog in Hap2 and an X ortholog in Hap1",
x = "TE frequency for H1 ortholog", y = "Ratio of H1 TEs / H2 TEs",
colour = "H2 Chrom") #+
## Warning: Removed 6863 rows containing missing values (`geom_col()`).
facet_wrap(vars(location), nrow = 2)
## <ggproto object: Class FacetWrap, Facet, gg>
## compute_layout: function
## draw_back: function
## draw_front: function
## draw_labels: function
## draw_panels: function
## finish_data: function
## init_scales: function
## map_data: function
## params: list
## setup_data: function
## setup_params: function
## shrink: TRUE
## train_scales: function
## vars: function
## super: <ggproto object: Class FacetWrap, Facet, gg>
Unused
Testing for genewise function
#rm(genePairs, inPairs, upPairs, allPairs, pair1, pair2, pairList)
{
gene2 <- pairList$H2[1]
#pattern2 <- paste("^ID=",gene2, sep = "")
pair1 <- H1list[[3]] %>%
filter( str_detect(attr, gene1) ) %>%
#distinct() %>%
select(scaff, start, count:density) %>%
mutate("geneH1" = gene1, "geneH2" = gene2) %>%
group_by(scaff, geneH1, geneH2) %>%
summarize(start = min(start), count = sum(count), TElen = sum(TElen),
eleLen = sum(eleLen),density = mean(density))
pair2 <- H2list[[3]] %>%
filter( str_detect(attr, gene2) ) %>%
#distinct() %>%
select(!end:attr) %>%
mutate("gene1" = gene1, "gene2" = gene2)
full_join(pair1, pair2, by = c("gene1", "gene2"))
} #small test sets
{
for (i in 1:len) {
# Save the pair's gene name for H1 and H2
gene1 <- pairList$H1[i]
gene2 <- pairList$H2[i]
#print(gene1)
# Each gene will only have one upstream region and one downstream region
for (j in 1:2) {
pair1 <- H1list[[j]] %>%
filter( str_detect(attr, gene1) ) %>%
distinct() %>%
select(scaff, start, count:density) %>%
mutate("geneH1" = gene1, "geneH2" = gene2, "location" = j)
print(pair1)
pair2 <- H2list[[j]] %>%
filter( str_detect(attr, gene2) ) %>%
distinct() %>%
select(scaff, start, count:density) %>%
mutate("geneH2" = gene2, "geneH1" = gene1, "location" = j)
print(pair2)
allPairs[[i]] <- full_join(pair1, pair2,
by = c("geneH1", "geneH2", "location"),
suffix = c("H1", "H2"))
} #end upstream/downstream search (allPairs[[1]] and allPairs[[2]] done)
# There may be several exons and introns per gene - sum/average the values
for (j in 3:4) {
pair1 <- H1list[[j]] %>%
filter( str_detect(attr, gene1) ) %>%
distinct() %>%
select(scaff, start, count:density) %>%
mutate("geneH1" = gene1, "geneH2" = gene2, "location" = j) %>%
group_by(scaff, geneH1, geneH2, location) %>%
summarize(start = robustMin(start), count = sum(count),
TElen = sum(TElen), eleLen = sum(eleLen),
density = mean(density), .groups = "keep")
print(pair1)
pair2 <- H2list[[j]] %>%
filter( str_detect(attr, gene2) ) %>%
distinct() %>%
select(scaff, start, count:density) %>%
mutate("geneH2" = gene2, "geneH1" = gene1, "location" = j) %>%
group_by(scaff, geneH1, geneH2, location) %>%
summarize(start = robustMin(start), count = sum(count),
TElen = sum(TElen), eleLen = sum(eleLen),
density = mean(density), .groups = "keep")
print(pair2)
allPairs[[j]][[i]] <- full_join(pair1, pair2,
by = c("geneH1", "geneH2", "location"),
suffix = c("H1", "H2"))
} #end exon/intron search
} #end gene pair search
} #iterating over H1list and H2list
{
upPair1 <- H1list[[1]] %>%
filter( str_detect(attr, gene1) ) %>%
select(scaff, count:density) %>%
mutate("gene1" = gene1, "gene2" = gene2,
"count1" = count, "TElen1" = TElen,
"eleLen1" = eleLen, "density1" = density) %>%
select(!count:density)
downPair1 <- H1list[[2]] %>%
filter( str_detect(attr, gene1) ) %>%
select(scaff, count:density) %>%
mutate("gene1" = gene1, "gene2" = gene2,
"count1" = count, "TElen1" = TElen,
"eleLen1" = eleLen, "density1" = density) %>%
select(!count:density)
exPair1 <- H1list[[3]] %>%
filter( str_detect(attr, gene1) ) %>%
select(scaff, count:density) %>%
mutate("gene1" = gene1, "gene2" = gene2,
"count1" = count, "TElen1" = TElen,
"eleLen1" = eleLen, "density1" = density) %>%
select(!count:density)
inPair1 <- H1list[[4]] %>%
filter( str_detect(attr, gene1) ) %>%
select(scaff, count:density) %>%
mutate("gene1" = gene1, "gene2" = gene2,
"count1" = count, "TElen1" = TElen,
"eleLen1" = eleLen, "density1" = density) %>%
select(!count:density)
upPair2 <- H2list[[1]] %>%
filter( str_detect(attr, gene2) ) %>%
select(scaff, count:density) %>%
mutate("gene1" = gene1, "gene2" = gene2,
"count2" = count, "TElen2" = TElen,
"eleLen2" = eleLen, "density2" = density) %>%
select(!count:density)
downPair2 <- H2list[[2]] %>%
filter( str_detect(attr, gene2) ) %>%
select(scaff, count:density) %>%
mutate("geneH1" = gene1, "geneH2" = gene2,
"countH2" = count, "TElenH2" = TElen,
"eleLenH2" = eleLen, "densityH2" = density) %>%
select(!count:density)
exPair2 <- H2list[[3]] %>%
filter( str_detect(attr, gene2) ) %>%
select(scaff, count:density) %>%
mutate("gene1" = gene1, "gene2" = gene2,
"count2" = count, "TElen2" = TElen,
"eleLen2" = eleLen, "density2" = density) %>%
select(!count:density)
inPair2 <- H2list[[4]] %>%
filter( str_detect(attr, gene2) ) %>%
select(scaff, count:density) %>%
mutate("gene1" = gene1, "gene2" = gene2,
"count2" = count, "TElen2" = TElen,
"eleLen2" = eleLen, "density2" = density) %>%
select(!count:density)
upPairs <- full_join(upPair1, upPair2, by = c("scaff", "gene1", "gene2"))
downPairs <- full_join(upPair1, upPair2, by = c("scaff", "gene1", "gene2"))
exPairs <- full_join(exPair1, exPair2, by = c("scaff", "gene1", "gene2"))
inPairs <- full_join(inPair1, inPair2, by = c("scaff", "gene1", "gene2"))
} # too many things
Different approach to summing TE bp in different regions
#%>%
group_by(seqOnt, region) %>%
summarize(summedbp = sum(bp)) %>%
mutate(X = case_when(
region == "PAR1" ~ summedbp/71000000,
region == "Old" ~ summedbp/(261000000-71000000),
region == "New" ~ summedbp/(367000000-261000000),
region == "PAR2" ~ summedbp/(483000000-367000000)
))
%>%
group_by(seqOnt, region) %>%
summarize(summedbp = sum(bp)) %>%
mutate(Y1 = case_when(
region == "New" ~ summedbp/64000000,
region == "Old" ~ summedbp/(272000000-64000000),
region == "PAR1" ~ summedbp/(343000000-272000000)
))
%>%
group_by(seqOnt, region) %>%
summarize(summedbp = sum(bp)) %>%
mutate(Y2 = case_when(
region == "PAR2" ~ summedbp/121000000,
region == "New" ~ summedbp/(159000000-121000000),
region == "Old" ~ summedbp/(348000000-159000000)
))
allSummedTEs <- full_join(XsummedTEs, Y1summedTEs,
by = join_by(seqOnt, region)) %>%
full_join(Y2summedTEs, by = join_by(seqOnt, region)) %>%
pivot_longer(cols = c(X, Y1, Y2),
names_to = "Chromosome", values_to = "bp")
NC female data
# NC female
#NCfemList <- importFiles("NCfem")
#femCounts <- windowAndMerge(NCfemList, NCfemScaffs)
#femCounts %>%
# ggplot(aes(winnum)) +
# geom_line(aes(y = upAvg, colour = "Upstream")) +
# geom_line(aes(y = downAvg, colour = "Downstream")) +
# geom_line(aes(y = exonAvg, colour = "Exon")) +
# geom_line(aes(y = intronAvg, colour = "Intron")) +
# pubTheme + theme(legend.position=c(0.87,0.5)) +
# labs(x = "NCfem Position (1Mb windows)",
# y = "Average TE Frequency in this Window",
# colour = "Location") +
# facet_grid( vars(scaff), scale = "free_x", space = "free" )
#femCounts %>%
# ggplot(aes(winnum)) +
# geom_line(aes(y = upSum, colour = "Upstream")) +
# geom_line(aes(y = downSum, colour = "Downstream")) +
# geom_line(aes(y = exonSum, colour = "Exon")) +
# geom_line(aes(y = intronSum, colour = "Intron")) +
# pubTheme + theme(legend.position=c(0.87,0.5)) +
# labs(x = "NCfem Position (1Mb windows)",
# y = "Windowed TE count",
# colour = "Location") +
# facet_grid( vars(scaff), scale = "free_x", space = "free" )
# limited Y axis
#femCounts %>%
# ggplot(aes(winnum)) +
# geom_line(aes(y = upSum, colour = "Upstream")) +
# geom_line(aes(y = downSum, colour = "Downstream")) +
# geom_line(aes(y = exonSum, colour = "Exon")) +
# geom_line(aes(y = intronSum, colour = "Intron")) +
# pubTheme + theme(legend.position=c(0.87,0.5)) +
# labs(x = "NCfem Position (1Mb windows)",
# y = "Windowed TE count",
# colour = "Location") +
# facet_grid( vars(scaff), scale = "free_x", space = "free" )
femCounts %>%
ggplot(aes(winnum)) +
geom_line(aes(y = upSum/upNum, colour = "Upstream")) +
geom_line(aes(y = downSum/downNum, colour = "Downstream")) +
geom_line(aes(y = exonSum/exonNum, colour = "Exon")) +
# geom_line(aes(y = intronSum/intronNum, colour = "Intron")) +
pubTheme + theme(legend.position=c(0.87,0.5)) +
# ylim(0, 30) +
labs(x = "NCfem Position (1Mb windows)",
y = "Windowed count divided by feature count",
colour = "Location") +
facet_grid( vars(scaff), scale = "free_x", space = "free" )
femCounts %>%
select(c(scaff, winnum, intronNum, intronSum, intronAvg)) %>%
ggplot(aes(x = winnum)) +
geom_line(aes(y = intronNum, colour = "Introns")) +
geom_line(aes(y = intronSum, colour = "TEs in Introns")) +
pubTheme + theme(legend.position=c(0.87,0.5)) +
ylim(0, 500) +
labs(x = "NCfem Position (1Mb windows)", y = "Average in Window",
colour = "Feature Type") +
facet_grid( vars(scaff), scale = "free_x", space = "free" )